{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.\n",
      "/home/user/anaconda3/envs/wyf/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:1617: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be deprecated in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
      "  warnings.warn(\n",
      "Some weights of the model checkpoint at /home/user/wyf/train_siglip_from_scratch/outputs were not used when initializing SiglipModel: ['b', 't', 'text_model.embeddings.LayerNorm.bias', 'text_model.embeddings.LayerNorm.weight', 'text_model.embeddings.position_embeddings.weight', 'text_model.embeddings.token_type_embeddings.weight', 'text_model.embeddings.word_embeddings.weight', 'text_model.encoder.layer.0.attention.output.LayerNorm.bias', 'text_model.encoder.layer.0.attention.output.LayerNorm.weight', 'text_model.encoder.layer.0.attention.output.dense.bias', 'text_model.encoder.layer.0.attention.output.dense.weight', 'text_model.encoder.layer.0.attention.self.key.bias', 'text_model.encoder.layer.0.attention.self.key.weight', 'text_model.encoder.layer.0.attention.self.query.bias', 'text_model.encoder.layer.0.attention.self.query.weight', 'text_model.encoder.layer.0.attention.self.value.bias', 'text_model.encoder.layer.0.attention.self.value.weight', 'text_model.encoder.layer.0.intermediate.dense.bias', 'text_model.encoder.layer.0.intermediate.dense.weight', 'text_model.encoder.layer.0.output.LayerNorm.bias', 'text_model.encoder.layer.0.output.LayerNorm.weight', 'text_model.encoder.layer.0.output.dense.bias', 'text_model.encoder.layer.0.output.dense.weight', 'text_model.encoder.layer.1.attention.output.LayerNorm.bias', 'text_model.encoder.layer.1.attention.output.LayerNorm.weight', 'text_model.encoder.layer.1.attention.output.dense.bias', 'text_model.encoder.layer.1.attention.output.dense.weight', 'text_model.encoder.layer.1.attention.self.key.bias', 'text_model.encoder.layer.1.attention.self.key.weight', 'text_model.encoder.layer.1.attention.self.query.bias', 'text_model.encoder.layer.1.attention.self.query.weight', 'text_model.encoder.layer.1.attention.self.value.bias', 'text_model.encoder.layer.1.attention.self.value.weight', 'text_model.encoder.layer.1.intermediate.dense.bias', 'text_model.encoder.layer.1.intermediate.dense.weight', 'text_model.encoder.layer.1.output.LayerNorm.bias', 'text_model.encoder.layer.1.output.LayerNorm.weight', 'text_model.encoder.layer.1.output.dense.bias', 'text_model.encoder.layer.1.output.dense.weight', 'text_model.encoder.layer.10.attention.output.LayerNorm.bias', 'text_model.encoder.layer.10.attention.output.LayerNorm.weight', 'text_model.encoder.layer.10.attention.output.dense.bias', 'text_model.encoder.layer.10.attention.output.dense.weight', 'text_model.encoder.layer.10.attention.self.key.bias', 'text_model.encoder.layer.10.attention.self.key.weight', 'text_model.encoder.layer.10.attention.self.query.bias', 'text_model.encoder.layer.10.attention.self.query.weight', 'text_model.encoder.layer.10.attention.self.value.bias', 'text_model.encoder.layer.10.attention.self.value.weight', 'text_model.encoder.layer.10.intermediate.dense.bias', 'text_model.encoder.layer.10.intermediate.dense.weight', 'text_model.encoder.layer.10.output.LayerNorm.bias', 'text_model.encoder.layer.10.output.LayerNorm.weight', 'text_model.encoder.layer.10.output.dense.bias', 'text_model.encoder.layer.10.output.dense.weight', 'text_model.encoder.layer.11.attention.output.LayerNorm.bias', 'text_model.encoder.layer.11.attention.output.LayerNorm.weight', 'text_model.encoder.layer.11.attention.output.dense.bias', 'text_model.encoder.layer.11.attention.output.dense.weight', 'text_model.encoder.layer.11.attention.self.key.bias', 'text_model.encoder.layer.11.attention.self.key.weight', 'text_model.encoder.layer.11.attention.self.query.bias', 'text_model.encoder.layer.11.attention.self.query.weight', 'text_model.encoder.layer.11.attention.self.value.bias', 'text_model.encoder.layer.11.attention.self.value.weight', 'text_model.encoder.layer.11.intermediate.dense.bias', 'text_model.encoder.layer.11.intermediate.dense.weight', 'text_model.encoder.layer.11.output.LayerNorm.bias', 'text_model.encoder.layer.11.output.LayerNorm.weight', 'text_model.encoder.layer.11.output.dense.bias', 'text_model.encoder.layer.11.output.dense.weight', 'text_model.encoder.layer.2.attention.output.LayerNorm.bias', 'text_model.encoder.layer.2.attention.output.LayerNorm.weight', 'text_model.encoder.layer.2.attention.output.dense.bias', 'text_model.encoder.layer.2.attention.output.dense.weight', 'text_model.encoder.layer.2.attention.self.key.bias', 'text_model.encoder.layer.2.attention.self.key.weight', 'text_model.encoder.layer.2.attention.self.query.bias', 'text_model.encoder.layer.2.attention.self.query.weight', 'text_model.encoder.layer.2.attention.self.value.bias', 'text_model.encoder.layer.2.attention.self.value.weight', 'text_model.encoder.layer.2.intermediate.dense.bias', 'text_model.encoder.layer.2.intermediate.dense.weight', 'text_model.encoder.layer.2.output.LayerNorm.bias', 'text_model.encoder.layer.2.output.LayerNorm.weight', 'text_model.encoder.layer.2.output.dense.bias', 'text_model.encoder.layer.2.output.dense.weight', 'text_model.encoder.layer.3.attention.output.LayerNorm.bias', 'text_model.encoder.layer.3.attention.output.LayerNorm.weight', 'text_model.encoder.layer.3.attention.output.dense.bias', 'text_model.encoder.layer.3.attention.output.dense.weight', 'text_model.encoder.layer.3.attention.self.key.bias', 'text_model.encoder.layer.3.attention.self.key.weight', 'text_model.encoder.layer.3.attention.self.query.bias', 'text_model.encoder.layer.3.attention.self.query.weight', 'text_model.encoder.layer.3.attention.self.value.bias', 'text_model.encoder.layer.3.attention.self.value.weight', 'text_model.encoder.layer.3.intermediate.dense.bias', 'text_model.encoder.layer.3.intermediate.dense.weight', 'text_model.encoder.layer.3.output.LayerNorm.bias', 'text_model.encoder.layer.3.output.LayerNorm.weight', 'text_model.encoder.layer.3.output.dense.bias', 'text_model.encoder.layer.3.output.dense.weight', 'text_model.encoder.layer.4.attention.output.LayerNorm.bias', 'text_model.encoder.layer.4.attention.output.LayerNorm.weight', 'text_model.encoder.layer.4.attention.output.dense.bias', 'text_model.encoder.layer.4.attention.output.dense.weight', 'text_model.encoder.layer.4.attention.self.key.bias', 'text_model.encoder.layer.4.attention.self.key.weight', 'text_model.encoder.layer.4.attention.self.query.bias', 'text_model.encoder.layer.4.attention.self.query.weight', 'text_model.encoder.layer.4.attention.self.value.bias', 'text_model.encoder.layer.4.attention.self.value.weight', 'text_model.encoder.layer.4.intermediate.dense.bias', 'text_model.encoder.layer.4.intermediate.dense.weight', 'text_model.encoder.layer.4.output.LayerNorm.bias', 'text_model.encoder.layer.4.output.LayerNorm.weight', 'text_model.encoder.layer.4.output.dense.bias', 'text_model.encoder.layer.4.output.dense.weight', 'text_model.encoder.layer.5.attention.output.LayerNorm.bias', 'text_model.encoder.layer.5.attention.output.LayerNorm.weight', 'text_model.encoder.layer.5.attention.output.dense.bias', 'text_model.encoder.layer.5.attention.output.dense.weight', 'text_model.encoder.layer.5.attention.self.key.bias', 'text_model.encoder.layer.5.attention.self.key.weight', 'text_model.encoder.layer.5.attention.self.query.bias', 'text_model.encoder.layer.5.attention.self.query.weight', 'text_model.encoder.layer.5.attention.self.value.bias', 'text_model.encoder.layer.5.attention.self.value.weight', 'text_model.encoder.layer.5.intermediate.dense.bias', 'text_model.encoder.layer.5.intermediate.dense.weight', 'text_model.encoder.layer.5.output.LayerNorm.bias', 'text_model.encoder.layer.5.output.LayerNorm.weight', 'text_model.encoder.layer.5.output.dense.bias', 'text_model.encoder.layer.5.output.dense.weight', 'text_model.encoder.layer.6.attention.output.LayerNorm.bias', 'text_model.encoder.layer.6.attention.output.LayerNorm.weight', 'text_model.encoder.layer.6.attention.output.dense.bias', 'text_model.encoder.layer.6.attention.output.dense.weight', 'text_model.encoder.layer.6.attention.self.key.bias', 'text_model.encoder.layer.6.attention.self.key.weight', 'text_model.encoder.layer.6.attention.self.query.bias', 'text_model.encoder.layer.6.attention.self.query.weight', 'text_model.encoder.layer.6.attention.self.value.bias', 'text_model.encoder.layer.6.attention.self.value.weight', 'text_model.encoder.layer.6.intermediate.dense.bias', 'text_model.encoder.layer.6.intermediate.dense.weight', 'text_model.encoder.layer.6.output.LayerNorm.bias', 'text_model.encoder.layer.6.output.LayerNorm.weight', 'text_model.encoder.layer.6.output.dense.bias', 'text_model.encoder.layer.6.output.dense.weight', 'text_model.encoder.layer.7.attention.output.LayerNorm.bias', 'text_model.encoder.layer.7.attention.output.LayerNorm.weight', 'text_model.encoder.layer.7.attention.output.dense.bias', 'text_model.encoder.layer.7.attention.output.dense.weight', 'text_model.encoder.layer.7.attention.self.key.bias', 'text_model.encoder.layer.7.attention.self.key.weight', 'text_model.encoder.layer.7.attention.self.query.bias', 'text_model.encoder.layer.7.attention.self.query.weight', 'text_model.encoder.layer.7.attention.self.value.bias', 'text_model.encoder.layer.7.attention.self.value.weight', 'text_model.encoder.layer.7.intermediate.dense.bias', 'text_model.encoder.layer.7.intermediate.dense.weight', 'text_model.encoder.layer.7.output.LayerNorm.bias', 'text_model.encoder.layer.7.output.LayerNorm.weight', 'text_model.encoder.layer.7.output.dense.bias', 'text_model.encoder.layer.7.output.dense.weight', 'text_model.encoder.layer.8.attention.output.LayerNorm.bias', 'text_model.encoder.layer.8.attention.output.LayerNorm.weight', 'text_model.encoder.layer.8.attention.output.dense.bias', 'text_model.encoder.layer.8.attention.output.dense.weight', 'text_model.encoder.layer.8.attention.self.key.bias', 'text_model.encoder.layer.8.attention.self.key.weight', 'text_model.encoder.layer.8.attention.self.query.bias', 'text_model.encoder.layer.8.attention.self.query.weight', 'text_model.encoder.layer.8.attention.self.value.bias', 'text_model.encoder.layer.8.attention.self.value.weight', 'text_model.encoder.layer.8.intermediate.dense.bias', 'text_model.encoder.layer.8.intermediate.dense.weight', 'text_model.encoder.layer.8.output.LayerNorm.bias', 'text_model.encoder.layer.8.output.LayerNorm.weight', 'text_model.encoder.layer.8.output.dense.bias', 'text_model.encoder.layer.8.output.dense.weight', 'text_model.encoder.layer.9.attention.output.LayerNorm.bias', 'text_model.encoder.layer.9.attention.output.LayerNorm.weight', 'text_model.encoder.layer.9.attention.output.dense.bias', 'text_model.encoder.layer.9.attention.output.dense.weight', 'text_model.encoder.layer.9.attention.self.key.bias', 'text_model.encoder.layer.9.attention.self.key.weight', 'text_model.encoder.layer.9.attention.self.query.bias', 'text_model.encoder.layer.9.attention.self.query.weight', 'text_model.encoder.layer.9.attention.self.value.bias', 'text_model.encoder.layer.9.attention.self.value.weight', 'text_model.encoder.layer.9.intermediate.dense.bias', 'text_model.encoder.layer.9.intermediate.dense.weight', 'text_model.encoder.layer.9.output.LayerNorm.bias', 'text_model.encoder.layer.9.output.LayerNorm.weight', 'text_model.encoder.layer.9.output.dense.bias', 'text_model.encoder.layer.9.output.dense.weight', 'text_model.pooler.dense.bias', 'text_model.pooler.dense.weight', 'vision_model.embeddings.cls_token', 'vision_model.embeddings.patch_embeddings.projection.bias', 'vision_model.embeddings.patch_embeddings.projection.weight', 'vision_model.embeddings.position_embeddings', 'vision_model.encoder.layer.0.attention.attention.key.bias', 'vision_model.encoder.layer.0.attention.attention.key.weight', 'vision_model.encoder.layer.0.attention.attention.query.bias', 'vision_model.encoder.layer.0.attention.attention.query.weight', 'vision_model.encoder.layer.0.attention.attention.value.bias', 'vision_model.encoder.layer.0.attention.attention.value.weight', 'vision_model.encoder.layer.0.attention.output.dense.bias', 'vision_model.encoder.layer.0.attention.output.dense.weight', 'vision_model.encoder.layer.0.intermediate.dense.bias', 'vision_model.encoder.layer.0.intermediate.dense.weight', 'vision_model.encoder.layer.0.layernorm_after.bias', 'vision_model.encoder.layer.0.layernorm_after.weight', 'vision_model.encoder.layer.0.layernorm_before.bias', 'vision_model.encoder.layer.0.layernorm_before.weight', 'vision_model.encoder.layer.0.output.dense.bias', 'vision_model.encoder.layer.0.output.dense.weight', 'vision_model.encoder.layer.1.attention.attention.key.bias', 'vision_model.encoder.layer.1.attention.attention.key.weight', 'vision_model.encoder.layer.1.attention.attention.query.bias', 'vision_model.encoder.layer.1.attention.attention.query.weight', 'vision_model.encoder.layer.1.attention.attention.value.bias', 'vision_model.encoder.layer.1.attention.attention.value.weight', 'vision_model.encoder.layer.1.attention.output.dense.bias', 'vision_model.encoder.layer.1.attention.output.dense.weight', 'vision_model.encoder.layer.1.intermediate.dense.bias', 'vision_model.encoder.layer.1.intermediate.dense.weight', 'vision_model.encoder.layer.1.layernorm_after.bias', 'vision_model.encoder.layer.1.layernorm_after.weight', 'vision_model.encoder.layer.1.layernorm_before.bias', 'vision_model.encoder.layer.1.layernorm_before.weight', 'vision_model.encoder.layer.1.output.dense.bias', 'vision_model.encoder.layer.1.output.dense.weight', 'vision_model.encoder.layer.10.attention.attention.key.bias', 'vision_model.encoder.layer.10.attention.attention.key.weight', 'vision_model.encoder.layer.10.attention.attention.query.bias', 'vision_model.encoder.layer.10.attention.attention.query.weight', 'vision_model.encoder.layer.10.attention.attention.value.bias', 'vision_model.encoder.layer.10.attention.attention.value.weight', 'vision_model.encoder.layer.10.attention.output.dense.bias', 'vision_model.encoder.layer.10.attention.output.dense.weight', 'vision_model.encoder.layer.10.intermediate.dense.bias', 'vision_model.encoder.layer.10.intermediate.dense.weight', 'vision_model.encoder.layer.10.layernorm_after.bias', 'vision_model.encoder.layer.10.layernorm_after.weight', 'vision_model.encoder.layer.10.layernorm_before.bias', 'vision_model.encoder.layer.10.layernorm_before.weight', 'vision_model.encoder.layer.10.output.dense.bias', 'vision_model.encoder.layer.10.output.dense.weight', 'vision_model.encoder.layer.11.attention.attention.key.bias', 'vision_model.encoder.layer.11.attention.attention.key.weight', 'vision_model.encoder.layer.11.attention.attention.query.bias', 'vision_model.encoder.layer.11.attention.attention.query.weight', 'vision_model.encoder.layer.11.attention.attention.value.bias', 'vision_model.encoder.layer.11.attention.attention.value.weight', 'vision_model.encoder.layer.11.attention.output.dense.bias', 'vision_model.encoder.layer.11.attention.output.dense.weight', 'vision_model.encoder.layer.11.intermediate.dense.bias', 'vision_model.encoder.layer.11.intermediate.dense.weight', 'vision_model.encoder.layer.11.layernorm_after.bias', 'vision_model.encoder.layer.11.layernorm_after.weight', 'vision_model.encoder.layer.11.layernorm_before.bias', 'vision_model.encoder.layer.11.layernorm_before.weight', 'vision_model.encoder.layer.11.output.dense.bias', 'vision_model.encoder.layer.11.output.dense.weight', 'vision_model.encoder.layer.2.attention.attention.key.bias', 'vision_model.encoder.layer.2.attention.attention.key.weight', 'vision_model.encoder.layer.2.attention.attention.query.bias', 'vision_model.encoder.layer.2.attention.attention.query.weight', 'vision_model.encoder.layer.2.attention.attention.value.bias', 'vision_model.encoder.layer.2.attention.attention.value.weight', 'vision_model.encoder.layer.2.attention.output.dense.bias', 'vision_model.encoder.layer.2.attention.output.dense.weight', 'vision_model.encoder.layer.2.intermediate.dense.bias', 'vision_model.encoder.layer.2.intermediate.dense.weight', 'vision_model.encoder.layer.2.layernorm_after.bias', 'vision_model.encoder.layer.2.layernorm_after.weight', 'vision_model.encoder.layer.2.layernorm_before.bias', 'vision_model.encoder.layer.2.layernorm_before.weight', 'vision_model.encoder.layer.2.output.dense.bias', 'vision_model.encoder.layer.2.output.dense.weight', 'vision_model.encoder.layer.3.attention.attention.key.bias', 'vision_model.encoder.layer.3.attention.attention.key.weight', 'vision_model.encoder.layer.3.attention.attention.query.bias', 'vision_model.encoder.layer.3.attention.attention.query.weight', 'vision_model.encoder.layer.3.attention.attention.value.bias', 'vision_model.encoder.layer.3.attention.attention.value.weight', 'vision_model.encoder.layer.3.attention.output.dense.bias', 'vision_model.encoder.layer.3.attention.output.dense.weight', 'vision_model.encoder.layer.3.intermediate.dense.bias', 'vision_model.encoder.layer.3.intermediate.dense.weight', 'vision_model.encoder.layer.3.layernorm_after.bias', 'vision_model.encoder.layer.3.layernorm_after.weight', 'vision_model.encoder.layer.3.layernorm_before.bias', 'vision_model.encoder.layer.3.layernorm_before.weight', 'vision_model.encoder.layer.3.output.dense.bias', 'vision_model.encoder.layer.3.output.dense.weight', 'vision_model.encoder.layer.4.attention.attention.key.bias', 'vision_model.encoder.layer.4.attention.attention.key.weight', 'vision_model.encoder.layer.4.attention.attention.query.bias', 'vision_model.encoder.layer.4.attention.attention.query.weight', 'vision_model.encoder.layer.4.attention.attention.value.bias', 'vision_model.encoder.layer.4.attention.attention.value.weight', 'vision_model.encoder.layer.4.attention.output.dense.bias', 'vision_model.encoder.layer.4.attention.output.dense.weight', 'vision_model.encoder.layer.4.intermediate.dense.bias', 'vision_model.encoder.layer.4.intermediate.dense.weight', 'vision_model.encoder.layer.4.layernorm_after.bias', 'vision_model.encoder.layer.4.layernorm_after.weight', 'vision_model.encoder.layer.4.layernorm_before.bias', 'vision_model.encoder.layer.4.layernorm_before.weight', 'vision_model.encoder.layer.4.output.dense.bias', 'vision_model.encoder.layer.4.output.dense.weight', 'vision_model.encoder.layer.5.attention.attention.key.bias', 'vision_model.encoder.layer.5.attention.attention.key.weight', 'vision_model.encoder.layer.5.attention.attention.query.bias', 'vision_model.encoder.layer.5.attention.attention.query.weight', 'vision_model.encoder.layer.5.attention.attention.value.bias', 'vision_model.encoder.layer.5.attention.attention.value.weight', 'vision_model.encoder.layer.5.attention.output.dense.bias', 'vision_model.encoder.layer.5.attention.output.dense.weight', 'vision_model.encoder.layer.5.intermediate.dense.bias', 'vision_model.encoder.layer.5.intermediate.dense.weight', 'vision_model.encoder.layer.5.layernorm_after.bias', 'vision_model.encoder.layer.5.layernorm_after.weight', 'vision_model.encoder.layer.5.layernorm_before.bias', 'vision_model.encoder.layer.5.layernorm_before.weight', 'vision_model.encoder.layer.5.output.dense.bias', 'vision_model.encoder.layer.5.output.dense.weight', 'vision_model.encoder.layer.6.attention.attention.key.bias', 'vision_model.encoder.layer.6.attention.attention.key.weight', 'vision_model.encoder.layer.6.attention.attention.query.bias', 'vision_model.encoder.layer.6.attention.attention.query.weight', 'vision_model.encoder.layer.6.attention.attention.value.bias', 'vision_model.encoder.layer.6.attention.attention.value.weight', 'vision_model.encoder.layer.6.attention.output.dense.bias', 'vision_model.encoder.layer.6.attention.output.dense.weight', 'vision_model.encoder.layer.6.intermediate.dense.bias', 'vision_model.encoder.layer.6.intermediate.dense.weight', 'vision_model.encoder.layer.6.layernorm_after.bias', 'vision_model.encoder.layer.6.layernorm_after.weight', 'vision_model.encoder.layer.6.layernorm_before.bias', 'vision_model.encoder.layer.6.layernorm_before.weight', 'vision_model.encoder.layer.6.output.dense.bias', 'vision_model.encoder.layer.6.output.dense.weight', 'vision_model.encoder.layer.7.attention.attention.key.bias', 'vision_model.encoder.layer.7.attention.attention.key.weight', 'vision_model.encoder.layer.7.attention.attention.query.bias', 'vision_model.encoder.layer.7.attention.attention.query.weight', 'vision_model.encoder.layer.7.attention.attention.value.bias', 'vision_model.encoder.layer.7.attention.attention.value.weight', 'vision_model.encoder.layer.7.attention.output.dense.bias', 'vision_model.encoder.layer.7.attention.output.dense.weight', 'vision_model.encoder.layer.7.intermediate.dense.bias', 'vision_model.encoder.layer.7.intermediate.dense.weight', 'vision_model.encoder.layer.7.layernorm_after.bias', 'vision_model.encoder.layer.7.layernorm_after.weight', 'vision_model.encoder.layer.7.layernorm_before.bias', 'vision_model.encoder.layer.7.layernorm_before.weight', 'vision_model.encoder.layer.7.output.dense.bias', 'vision_model.encoder.layer.7.output.dense.weight', 'vision_model.encoder.layer.8.attention.attention.key.bias', 'vision_model.encoder.layer.8.attention.attention.key.weight', 'vision_model.encoder.layer.8.attention.attention.query.bias', 'vision_model.encoder.layer.8.attention.attention.query.weight', 'vision_model.encoder.layer.8.attention.attention.value.bias', 'vision_model.encoder.layer.8.attention.attention.value.weight', 'vision_model.encoder.layer.8.attention.output.dense.bias', 'vision_model.encoder.layer.8.attention.output.dense.weight', 'vision_model.encoder.layer.8.intermediate.dense.bias', 'vision_model.encoder.layer.8.intermediate.dense.weight', 'vision_model.encoder.layer.8.layernorm_after.bias', 'vision_model.encoder.layer.8.layernorm_after.weight', 'vision_model.encoder.layer.8.layernorm_before.bias', 'vision_model.encoder.layer.8.layernorm_before.weight', 'vision_model.encoder.layer.8.output.dense.bias', 'vision_model.encoder.layer.8.output.dense.weight', 'vision_model.encoder.layer.9.attention.attention.key.bias', 'vision_model.encoder.layer.9.attention.attention.key.weight', 'vision_model.encoder.layer.9.attention.attention.query.bias', 'vision_model.encoder.layer.9.attention.attention.query.weight', 'vision_model.encoder.layer.9.attention.attention.value.bias', 'vision_model.encoder.layer.9.attention.attention.value.weight', 'vision_model.encoder.layer.9.attention.output.dense.bias', 'vision_model.encoder.layer.9.attention.output.dense.weight', 'vision_model.encoder.layer.9.intermediate.dense.bias', 'vision_model.encoder.layer.9.intermediate.dense.weight', 'vision_model.encoder.layer.9.layernorm_after.bias', 'vision_model.encoder.layer.9.layernorm_after.weight', 'vision_model.encoder.layer.9.layernorm_before.bias', 'vision_model.encoder.layer.9.layernorm_before.weight', 'vision_model.encoder.layer.9.output.dense.bias', 'vision_model.encoder.layer.9.output.dense.weight', 'vision_model.layernorm.bias', 'vision_model.layernorm.weight', 'vision_model.pooler.dense.bias', 'vision_model.pooler.dense.weight']\n",
      "- This IS expected if you are initializing SiglipModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing SiglipModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "Some weights of SiglipModel were not initialized from the model checkpoint at /home/user/wyf/train_siglip_from_scratch/outputs and are newly initialized: ['logit_bias', 'logit_scale', 'text_model.embeddings.position_embedding.weight', 'text_model.embeddings.token_embedding.weight', 'text_model.encoder.layers.0.layer_norm1.bias', 'text_model.encoder.layers.0.layer_norm1.weight', 'text_model.encoder.layers.0.layer_norm2.bias', 'text_model.encoder.layers.0.layer_norm2.weight', 'text_model.encoder.layers.0.mlp.fc1.bias', 'text_model.encoder.layers.0.mlp.fc1.weight', 'text_model.encoder.layers.0.mlp.fc2.bias', 'text_model.encoder.layers.0.mlp.fc2.weight', 'text_model.encoder.layers.0.self_attn.k_proj.bias', 'text_model.encoder.layers.0.self_attn.k_proj.weight', 'text_model.encoder.layers.0.self_attn.out_proj.bias', 'text_model.encoder.layers.0.self_attn.out_proj.weight', 'text_model.encoder.layers.0.self_attn.q_proj.bias', 'text_model.encoder.layers.0.self_attn.q_proj.weight', 'text_model.encoder.layers.0.self_attn.v_proj.bias', 'text_model.encoder.layers.0.self_attn.v_proj.weight', 'text_model.encoder.layers.1.layer_norm1.bias', 'text_model.encoder.layers.1.layer_norm1.weight', 'text_model.encoder.layers.1.layer_norm2.bias', 'text_model.encoder.layers.1.layer_norm2.weight', 'text_model.encoder.layers.1.mlp.fc1.bias', 'text_model.encoder.layers.1.mlp.fc1.weight', 'text_model.encoder.layers.1.mlp.fc2.bias', 'text_model.encoder.layers.1.mlp.fc2.weight', 'text_model.encoder.layers.1.self_attn.k_proj.bias', 'text_model.encoder.layers.1.self_attn.k_proj.weight', 'text_model.encoder.layers.1.self_attn.out_proj.bias', 'text_model.encoder.layers.1.self_attn.out_proj.weight', 'text_model.encoder.layers.1.self_attn.q_proj.bias', 'text_model.encoder.layers.1.self_attn.q_proj.weight', 'text_model.encoder.layers.1.self_attn.v_proj.bias', 'text_model.encoder.layers.1.self_attn.v_proj.weight', 'text_model.encoder.layers.10.layer_norm1.bias', 'text_model.encoder.layers.10.layer_norm1.weight', 'text_model.encoder.layers.10.layer_norm2.bias', 'text_model.encoder.layers.10.layer_norm2.weight', 'text_model.encoder.layers.10.mlp.fc1.bias', 'text_model.encoder.layers.10.mlp.fc1.weight', 'text_model.encoder.layers.10.mlp.fc2.bias', 'text_model.encoder.layers.10.mlp.fc2.weight', 'text_model.encoder.layers.10.self_attn.k_proj.bias', 'text_model.encoder.layers.10.self_attn.k_proj.weight', 'text_model.encoder.layers.10.self_attn.out_proj.bias', 'text_model.encoder.layers.10.self_attn.out_proj.weight', 'text_model.encoder.layers.10.self_attn.q_proj.bias', 'text_model.encoder.layers.10.self_attn.q_proj.weight', 'text_model.encoder.layers.10.self_attn.v_proj.bias', 'text_model.encoder.layers.10.self_attn.v_proj.weight', 'text_model.encoder.layers.11.layer_norm1.bias', 'text_model.encoder.layers.11.layer_norm1.weight', 'text_model.encoder.layers.11.layer_norm2.bias', 'text_model.encoder.layers.11.layer_norm2.weight', 'text_model.encoder.layers.11.mlp.fc1.bias', 'text_model.encoder.layers.11.mlp.fc1.weight', 'text_model.encoder.layers.11.mlp.fc2.bias', 'text_model.encoder.layers.11.mlp.fc2.weight', 'text_model.encoder.layers.11.self_attn.k_proj.bias', 'text_model.encoder.layers.11.self_attn.k_proj.weight', 'text_model.encoder.layers.11.self_attn.out_proj.bias', 'text_model.encoder.layers.11.self_attn.out_proj.weight', 'text_model.encoder.layers.11.self_attn.q_proj.bias', 'text_model.encoder.layers.11.self_attn.q_proj.weight', 'text_model.encoder.layers.11.self_attn.v_proj.bias', 'text_model.encoder.layers.11.self_attn.v_proj.weight', 'text_model.encoder.layers.2.layer_norm1.bias', 'text_model.encoder.layers.2.layer_norm1.weight', 'text_model.encoder.layers.2.layer_norm2.bias', 'text_model.encoder.layers.2.layer_norm2.weight', 'text_model.encoder.layers.2.mlp.fc1.bias', 'text_model.encoder.layers.2.mlp.fc1.weight', 'text_model.encoder.layers.2.mlp.fc2.bias', 'text_model.encoder.layers.2.mlp.fc2.weight', 'text_model.encoder.layers.2.self_attn.k_proj.bias', 'text_model.encoder.layers.2.self_attn.k_proj.weight', 'text_model.encoder.layers.2.self_attn.out_proj.bias', 'text_model.encoder.layers.2.self_attn.out_proj.weight', 'text_model.encoder.layers.2.self_attn.q_proj.bias', 'text_model.encoder.layers.2.self_attn.q_proj.weight', 'text_model.encoder.layers.2.self_attn.v_proj.bias', 'text_model.encoder.layers.2.self_attn.v_proj.weight', 'text_model.encoder.layers.3.layer_norm1.bias', 'text_model.encoder.layers.3.layer_norm1.weight', 'text_model.encoder.layers.3.layer_norm2.bias', 'text_model.encoder.layers.3.layer_norm2.weight', 'text_model.encoder.layers.3.mlp.fc1.bias', 'text_model.encoder.layers.3.mlp.fc1.weight', 'text_model.encoder.layers.3.mlp.fc2.bias', 'text_model.encoder.layers.3.mlp.fc2.weight', 'text_model.encoder.layers.3.self_attn.k_proj.bias', 'text_model.encoder.layers.3.self_attn.k_proj.weight', 'text_model.encoder.layers.3.self_attn.out_proj.bias', 'text_model.encoder.layers.3.self_attn.out_proj.weight', 'text_model.encoder.layers.3.self_attn.q_proj.bias', 'text_model.encoder.layers.3.self_attn.q_proj.weight', 'text_model.encoder.layers.3.self_attn.v_proj.bias', 'text_model.encoder.layers.3.self_attn.v_proj.weight', 'text_model.encoder.layers.4.layer_norm1.bias', 'text_model.encoder.layers.4.layer_norm1.weight', 'text_model.encoder.layers.4.layer_norm2.bias', 'text_model.encoder.layers.4.layer_norm2.weight', 'text_model.encoder.layers.4.mlp.fc1.bias', 'text_model.encoder.layers.4.mlp.fc1.weight', 'text_model.encoder.layers.4.mlp.fc2.bias', 'text_model.encoder.layers.4.mlp.fc2.weight', 'text_model.encoder.layers.4.self_attn.k_proj.bias', 'text_model.encoder.layers.4.self_attn.k_proj.weight', 'text_model.encoder.layers.4.self_attn.out_proj.bias', 'text_model.encoder.layers.4.self_attn.out_proj.weight', 'text_model.encoder.layers.4.self_attn.q_proj.bias', 'text_model.encoder.layers.4.self_attn.q_proj.weight', 'text_model.encoder.layers.4.self_attn.v_proj.bias', 'text_model.encoder.layers.4.self_attn.v_proj.weight', 'text_model.encoder.layers.5.layer_norm1.bias', 'text_model.encoder.layers.5.layer_norm1.weight', 'text_model.encoder.layers.5.layer_norm2.bias', 'text_model.encoder.layers.5.layer_norm2.weight', 'text_model.encoder.layers.5.mlp.fc1.bias', 'text_model.encoder.layers.5.mlp.fc1.weight', 'text_model.encoder.layers.5.mlp.fc2.bias', 'text_model.encoder.layers.5.mlp.fc2.weight', 'text_model.encoder.layers.5.self_attn.k_proj.bias', 'text_model.encoder.layers.5.self_attn.k_proj.weight', 'text_model.encoder.layers.5.self_attn.out_proj.bias', 'text_model.encoder.layers.5.self_attn.out_proj.weight', 'text_model.encoder.layers.5.self_attn.q_proj.bias', 'text_model.encoder.layers.5.self_attn.q_proj.weight', 'text_model.encoder.layers.5.self_attn.v_proj.bias', 'text_model.encoder.layers.5.self_attn.v_proj.weight', 'text_model.encoder.layers.6.layer_norm1.bias', 'text_model.encoder.layers.6.layer_norm1.weight', 'text_model.encoder.layers.6.layer_norm2.bias', 'text_model.encoder.layers.6.layer_norm2.weight', 'text_model.encoder.layers.6.mlp.fc1.bias', 'text_model.encoder.layers.6.mlp.fc1.weight', 'text_model.encoder.layers.6.mlp.fc2.bias', 'text_model.encoder.layers.6.mlp.fc2.weight', 'text_model.encoder.layers.6.self_attn.k_proj.bias', 'text_model.encoder.layers.6.self_attn.k_proj.weight', 'text_model.encoder.layers.6.self_attn.out_proj.bias', 'text_model.encoder.layers.6.self_attn.out_proj.weight', 'text_model.encoder.layers.6.self_attn.q_proj.bias', 'text_model.encoder.layers.6.self_attn.q_proj.weight', 'text_model.encoder.layers.6.self_attn.v_proj.bias', 'text_model.encoder.layers.6.self_attn.v_proj.weight', 'text_model.encoder.layers.7.layer_norm1.bias', 'text_model.encoder.layers.7.layer_norm1.weight', 'text_model.encoder.layers.7.layer_norm2.bias', 'text_model.encoder.layers.7.layer_norm2.weight', 'text_model.encoder.layers.7.mlp.fc1.bias', 'text_model.encoder.layers.7.mlp.fc1.weight', 'text_model.encoder.layers.7.mlp.fc2.bias', 'text_model.encoder.layers.7.mlp.fc2.weight', 'text_model.encoder.layers.7.self_attn.k_proj.bias', 'text_model.encoder.layers.7.self_attn.k_proj.weight', 'text_model.encoder.layers.7.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.out_proj.weight', 'text_model.encoder.layers.7.self_attn.q_proj.bias', 'text_model.encoder.layers.7.self_attn.q_proj.weight', 'text_model.encoder.layers.7.self_attn.v_proj.bias', 'text_model.encoder.layers.7.self_attn.v_proj.weight', 'text_model.encoder.layers.8.layer_norm1.bias', 'text_model.encoder.layers.8.layer_norm1.weight', 'text_model.encoder.layers.8.layer_norm2.bias', 'text_model.encoder.layers.8.layer_norm2.weight', 'text_model.encoder.layers.8.mlp.fc1.bias', 'text_model.encoder.layers.8.mlp.fc1.weight', 'text_model.encoder.layers.8.mlp.fc2.bias', 'text_model.encoder.layers.8.mlp.fc2.weight', 'text_model.encoder.layers.8.self_attn.k_proj.bias', 'text_model.encoder.layers.8.self_attn.k_proj.weight', 'text_model.encoder.layers.8.self_attn.out_proj.bias', 'text_model.encoder.layers.8.self_attn.out_proj.weight', 'text_model.encoder.layers.8.self_attn.q_proj.bias', 'text_model.encoder.layers.8.self_attn.q_proj.weight', 'text_model.encoder.layers.8.self_attn.v_proj.bias', 'text_model.encoder.layers.8.self_attn.v_proj.weight', 'text_model.encoder.layers.9.layer_norm1.bias', 'text_model.encoder.layers.9.layer_norm1.weight', 'text_model.encoder.layers.9.layer_norm2.bias', 'text_model.encoder.layers.9.layer_norm2.weight', 'text_model.encoder.layers.9.mlp.fc1.bias', 'text_model.encoder.layers.9.mlp.fc1.weight', 'text_model.encoder.layers.9.mlp.fc2.bias', 'text_model.encoder.layers.9.mlp.fc2.weight', 'text_model.encoder.layers.9.self_attn.k_proj.bias', 'text_model.encoder.layers.9.self_attn.k_proj.weight', 'text_model.encoder.layers.9.self_attn.out_proj.bias', 'text_model.encoder.layers.9.self_attn.out_proj.weight', 'text_model.encoder.layers.9.self_attn.q_proj.bias', 'text_model.encoder.layers.9.self_attn.q_proj.weight', 'text_model.encoder.layers.9.self_attn.v_proj.bias', 'text_model.encoder.layers.9.self_attn.v_proj.weight', 'text_model.final_layer_norm.bias', 'text_model.final_layer_norm.weight', 'text_model.head.bias', 'text_model.head.weight', 'vision_model.embeddings.patch_embedding.bias', 'vision_model.embeddings.patch_embedding.weight', 'vision_model.embeddings.position_embedding.weight', 'vision_model.encoder.layers.0.layer_norm1.bias', 'vision_model.encoder.layers.0.layer_norm1.weight', 'vision_model.encoder.layers.0.layer_norm2.bias', 'vision_model.encoder.layers.0.layer_norm2.weight', 'vision_model.encoder.layers.0.mlp.fc1.bias', 'vision_model.encoder.layers.0.mlp.fc1.weight', 'vision_model.encoder.layers.0.mlp.fc2.bias', 'vision_model.encoder.layers.0.mlp.fc2.weight', 'vision_model.encoder.layers.0.self_attn.k_proj.bias', 'vision_model.encoder.layers.0.self_attn.k_proj.weight', 'vision_model.encoder.layers.0.self_attn.out_proj.bias', 'vision_model.encoder.layers.0.self_attn.out_proj.weight', 'vision_model.encoder.layers.0.self_attn.q_proj.bias', 'vision_model.encoder.layers.0.self_attn.q_proj.weight', 'vision_model.encoder.layers.0.self_attn.v_proj.bias', 'vision_model.encoder.layers.0.self_attn.v_proj.weight', 'vision_model.encoder.layers.1.layer_norm1.bias', 'vision_model.encoder.layers.1.layer_norm1.weight', 'vision_model.encoder.layers.1.layer_norm2.bias', 'vision_model.encoder.layers.1.layer_norm2.weight', 'vision_model.encoder.layers.1.mlp.fc1.bias', 'vision_model.encoder.layers.1.mlp.fc1.weight', 'vision_model.encoder.layers.1.mlp.fc2.bias', 'vision_model.encoder.layers.1.mlp.fc2.weight', 'vision_model.encoder.layers.1.self_attn.k_proj.bias', 'vision_model.encoder.layers.1.self_attn.k_proj.weight', 'vision_model.encoder.layers.1.self_attn.out_proj.bias', 'vision_model.encoder.layers.1.self_attn.out_proj.weight', 'vision_model.encoder.layers.1.self_attn.q_proj.bias', 'vision_model.encoder.layers.1.self_attn.q_proj.weight', 'vision_model.encoder.layers.1.self_attn.v_proj.bias', 'vision_model.encoder.layers.1.self_attn.v_proj.weight', 'vision_model.encoder.layers.10.layer_norm1.bias', 'vision_model.encoder.layers.10.layer_norm1.weight', 'vision_model.encoder.layers.10.layer_norm2.bias', 'vision_model.encoder.layers.10.layer_norm2.weight', 'vision_model.encoder.layers.10.mlp.fc1.bias', 'vision_model.encoder.layers.10.mlp.fc1.weight', 'vision_model.encoder.layers.10.mlp.fc2.bias', 'vision_model.encoder.layers.10.mlp.fc2.weight', 'vision_model.encoder.layers.10.self_attn.k_proj.bias', 'vision_model.encoder.layers.10.self_attn.k_proj.weight', 'vision_model.encoder.layers.10.self_attn.out_proj.bias', 'vision_model.encoder.layers.10.self_attn.out_proj.weight', 'vision_model.encoder.layers.10.self_attn.q_proj.bias', 'vision_model.encoder.layers.10.self_attn.q_proj.weight', 'vision_model.encoder.layers.10.self_attn.v_proj.bias', 'vision_model.encoder.layers.10.self_attn.v_proj.weight', 'vision_model.encoder.layers.11.layer_norm1.bias', 'vision_model.encoder.layers.11.layer_norm1.weight', 'vision_model.encoder.layers.11.layer_norm2.bias', 'vision_model.encoder.layers.11.layer_norm2.weight', 'vision_model.encoder.layers.11.mlp.fc1.bias', 'vision_model.encoder.layers.11.mlp.fc1.weight', 'vision_model.encoder.layers.11.mlp.fc2.bias', 'vision_model.encoder.layers.11.mlp.fc2.weight', 'vision_model.encoder.layers.11.self_attn.k_proj.bias', 'vision_model.encoder.layers.11.self_attn.k_proj.weight', 'vision_model.encoder.layers.11.self_attn.out_proj.bias', 'vision_model.encoder.layers.11.self_attn.out_proj.weight', 'vision_model.encoder.layers.11.self_attn.q_proj.bias', 'vision_model.encoder.layers.11.self_attn.q_proj.weight', 'vision_model.encoder.layers.11.self_attn.v_proj.bias', 'vision_model.encoder.layers.11.self_attn.v_proj.weight', 'vision_model.encoder.layers.2.layer_norm1.bias', 'vision_model.encoder.layers.2.layer_norm1.weight', 'vision_model.encoder.layers.2.layer_norm2.bias', 'vision_model.encoder.layers.2.layer_norm2.weight', 'vision_model.encoder.layers.2.mlp.fc1.bias', 'vision_model.encoder.layers.2.mlp.fc1.weight', 'vision_model.encoder.layers.2.mlp.fc2.bias', 'vision_model.encoder.layers.2.mlp.fc2.weight', 'vision_model.encoder.layers.2.self_attn.k_proj.bias', 'vision_model.encoder.layers.2.self_attn.k_proj.weight', 'vision_model.encoder.layers.2.self_attn.out_proj.bias', 'vision_model.encoder.layers.2.self_attn.out_proj.weight', 'vision_model.encoder.layers.2.self_attn.q_proj.bias', 'vision_model.encoder.layers.2.self_attn.q_proj.weight', 'vision_model.encoder.layers.2.self_attn.v_proj.bias', 'vision_model.encoder.layers.2.self_attn.v_proj.weight', 'vision_model.encoder.layers.3.layer_norm1.bias', 'vision_model.encoder.layers.3.layer_norm1.weight', 'vision_model.encoder.layers.3.layer_norm2.bias', 'vision_model.encoder.layers.3.layer_norm2.weight', 'vision_model.encoder.layers.3.mlp.fc1.bias', 'vision_model.encoder.layers.3.mlp.fc1.weight', 'vision_model.encoder.layers.3.mlp.fc2.bias', 'vision_model.encoder.layers.3.mlp.fc2.weight', 'vision_model.encoder.layers.3.self_attn.k_proj.bias', 'vision_model.encoder.layers.3.self_attn.k_proj.weight', 'vision_model.encoder.layers.3.self_attn.out_proj.bias', 'vision_model.encoder.layers.3.self_attn.out_proj.weight', 'vision_model.encoder.layers.3.self_attn.q_proj.bias', 'vision_model.encoder.layers.3.self_attn.q_proj.weight', 'vision_model.encoder.layers.3.self_attn.v_proj.bias', 'vision_model.encoder.layers.3.self_attn.v_proj.weight', 'vision_model.encoder.layers.4.layer_norm1.bias', 'vision_model.encoder.layers.4.layer_norm1.weight', 'vision_model.encoder.layers.4.layer_norm2.bias', 'vision_model.encoder.layers.4.layer_norm2.weight', 'vision_model.encoder.layers.4.mlp.fc1.bias', 'vision_model.encoder.layers.4.mlp.fc1.weight', 'vision_model.encoder.layers.4.mlp.fc2.bias', 'vision_model.encoder.layers.4.mlp.fc2.weight', 'vision_model.encoder.layers.4.self_attn.k_proj.bias', 'vision_model.encoder.layers.4.self_attn.k_proj.weight', 'vision_model.encoder.layers.4.self_attn.out_proj.bias', 'vision_model.encoder.layers.4.self_attn.out_proj.weight', 'vision_model.encoder.layers.4.self_attn.q_proj.bias', 'vision_model.encoder.layers.4.self_attn.q_proj.weight', 'vision_model.encoder.layers.4.self_attn.v_proj.bias', 'vision_model.encoder.layers.4.self_attn.v_proj.weight', 'vision_model.encoder.layers.5.layer_norm1.bias', 'vision_model.encoder.layers.5.layer_norm1.weight', 'vision_model.encoder.layers.5.layer_norm2.bias', 'vision_model.encoder.layers.5.layer_norm2.weight', 'vision_model.encoder.layers.5.mlp.fc1.bias', 'vision_model.encoder.layers.5.mlp.fc1.weight', 'vision_model.encoder.layers.5.mlp.fc2.bias', 'vision_model.encoder.layers.5.mlp.fc2.weight', 'vision_model.encoder.layers.5.self_attn.k_proj.bias', 'vision_model.encoder.layers.5.self_attn.k_proj.weight', 'vision_model.encoder.layers.5.self_attn.out_proj.bias', 'vision_model.encoder.layers.5.self_attn.out_proj.weight', 'vision_model.encoder.layers.5.self_attn.q_proj.bias', 'vision_model.encoder.layers.5.self_attn.q_proj.weight', 'vision_model.encoder.layers.5.self_attn.v_proj.bias', 'vision_model.encoder.layers.5.self_attn.v_proj.weight', 'vision_model.encoder.layers.6.layer_norm1.bias', 'vision_model.encoder.layers.6.layer_norm1.weight', 'vision_model.encoder.layers.6.layer_norm2.bias', 'vision_model.encoder.layers.6.layer_norm2.weight', 'vision_model.encoder.layers.6.mlp.fc1.bias', 'vision_model.encoder.layers.6.mlp.fc1.weight', 'vision_model.encoder.layers.6.mlp.fc2.bias', 'vision_model.encoder.layers.6.mlp.fc2.weight', 'vision_model.encoder.layers.6.self_attn.k_proj.bias', 'vision_model.encoder.layers.6.self_attn.k_proj.weight', 'vision_model.encoder.layers.6.self_attn.out_proj.bias', 'vision_model.encoder.layers.6.self_attn.out_proj.weight', 'vision_model.encoder.layers.6.self_attn.q_proj.bias', 'vision_model.encoder.layers.6.self_attn.q_proj.weight', 'vision_model.encoder.layers.6.self_attn.v_proj.bias', 'vision_model.encoder.layers.6.self_attn.v_proj.weight', 'vision_model.encoder.layers.7.layer_norm1.bias', 'vision_model.encoder.layers.7.layer_norm1.weight', 'vision_model.encoder.layers.7.layer_norm2.bias', 'vision_model.encoder.layers.7.layer_norm2.weight', 'vision_model.encoder.layers.7.mlp.fc1.bias', 'vision_model.encoder.layers.7.mlp.fc1.weight', 'vision_model.encoder.layers.7.mlp.fc2.bias', 'vision_model.encoder.layers.7.mlp.fc2.weight', 'vision_model.encoder.layers.7.self_attn.k_proj.bias', 'vision_model.encoder.layers.7.self_attn.k_proj.weight', 'vision_model.encoder.layers.7.self_attn.out_proj.bias', 'vision_model.encoder.layers.7.self_attn.out_proj.weight', 'vision_model.encoder.layers.7.self_attn.q_proj.bias', 'vision_model.encoder.layers.7.self_attn.q_proj.weight', 'vision_model.encoder.layers.7.self_attn.v_proj.bias', 'vision_model.encoder.layers.7.self_attn.v_proj.weight', 'vision_model.encoder.layers.8.layer_norm1.bias', 'vision_model.encoder.layers.8.layer_norm1.weight', 'vision_model.encoder.layers.8.layer_norm2.bias', 'vision_model.encoder.layers.8.layer_norm2.weight', 'vision_model.encoder.layers.8.mlp.fc1.bias', 'vision_model.encoder.layers.8.mlp.fc1.weight', 'vision_model.encoder.layers.8.mlp.fc2.bias', 'vision_model.encoder.layers.8.mlp.fc2.weight', 'vision_model.encoder.layers.8.self_attn.k_proj.bias', 'vision_model.encoder.layers.8.self_attn.k_proj.weight', 'vision_model.encoder.layers.8.self_attn.out_proj.bias', 'vision_model.encoder.layers.8.self_attn.out_proj.weight', 'vision_model.encoder.layers.8.self_attn.q_proj.bias', 'vision_model.encoder.layers.8.self_attn.q_proj.weight', 'vision_model.encoder.layers.8.self_attn.v_proj.bias', 'vision_model.encoder.layers.8.self_attn.v_proj.weight', 'vision_model.encoder.layers.9.layer_norm1.bias', 'vision_model.encoder.layers.9.layer_norm1.weight', 'vision_model.encoder.layers.9.layer_norm2.bias', 'vision_model.encoder.layers.9.layer_norm2.weight', 'vision_model.encoder.layers.9.mlp.fc1.bias', 'vision_model.encoder.layers.9.mlp.fc1.weight', 'vision_model.encoder.layers.9.mlp.fc2.bias', 'vision_model.encoder.layers.9.mlp.fc2.weight', 'vision_model.encoder.layers.9.self_attn.k_proj.bias', 'vision_model.encoder.layers.9.self_attn.k_proj.weight', 'vision_model.encoder.layers.9.self_attn.out_proj.bias', 'vision_model.encoder.layers.9.self_attn.out_proj.weight', 'vision_model.encoder.layers.9.self_attn.q_proj.bias', 'vision_model.encoder.layers.9.self_attn.q_proj.weight', 'vision_model.encoder.layers.9.self_attn.v_proj.bias', 'vision_model.encoder.layers.9.self_attn.v_proj.weight', 'vision_model.head.attention.in_proj_bias', 'vision_model.head.attention.in_proj_weight', 'vision_model.head.attention.out_proj.bias', 'vision_model.head.attention.out_proj.weight', 'vision_model.head.layernorm.bias', 'vision_model.head.layernorm.weight', 'vision_model.head.mlp.fc1.bias', 'vision_model.head.mlp.fc1.weight', 'vision_model.head.mlp.fc2.bias', 'vision_model.head.mlp.fc2.weight', 'vision_model.head.probe', 'vision_model.post_layernorm.bias', 'vision_model.post_layernorm.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    }
   ],
   "source": [
    "from PIL import Image\n",
    "import requests\n",
    "from transformers import AutoProcessor, AutoModel, AutoTokenizer\n",
    "import torch\n",
    "processor = AutoProcessor.from_pretrained(\"/home/user/wyf/train_siglip_from_scratch/vit-base-patch16-224\")\n",
    "tokenizer = AutoTokenizer.from_pretrained('/home/user/wyf/chinese-roberta-wwm-ext')\n",
    "model = AutoModel.from_pretrained('/home/user/wyf/train_siglip_from_scratch/outputs')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([[0.5025, 0.5097]])\n",
      "50.2% that image 0 is '两只猫在玩耍'\n",
      "51.0% that image 0 is '一颗树'\n"
     ]
    }
   ],
   "source": [
    "url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n",
    "image = Image.open(requests.get(url, stream=True).raw)\n",
    "# image = Image.open('/home/user/wyf/train_multimodal_from_scratch/test_images/th1.png')\n",
    "texts = [\"两只猫在玩耍\", '一颗树']\n",
    "# important: we pass `padding=max_length` since the model was trained with this\n",
    "input_ids = tokenizer(texts, padding=\"max_length\", return_tensors=\"pt\", max_length=64)['input_ids']\n",
    "attention_mask = tokenizer(texts, padding=\"max_length\", return_tensors=\"pt\", max_length=64)['attention_mask']\n",
    "pixel_values = processor(images=image, return_tensors=\"pt\")['pixel_values']\n",
    "model.eval()\n",
    "with torch.no_grad():\n",
    "    outputs = model(input_ids=input_ids, attention_mask=attention_mask, pixel_values=pixel_values)\n",
    "\n",
    "logits_per_image = outputs.logits_per_image\n",
    "probs = torch.sigmoid(logits_per_image) # these are the probabilities\n",
    "print(probs)\n",
    "print(f\"{probs[0][0]:.1%} that image 0 is '{texts[0]}'\")\n",
    "print(f\"{probs[0][1]:.1%} that image 0 is '{texts[1]}'\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "wyf",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
